Corpus preparation

We use here data from Media cloud with only title of news. Therfore the program is more simple than in course.

Selection of media

# Load data with the function fread (fast) and the encoding UTF-8
df1<-fread("data/source/en_GBR_guardi_2019_2020.csv", encoding = "UTF-8")
df1$media<-"en_GBR_guardi"


df2<-fread("data/source/en_GBR_telegr_2019_2020.csv", encoding = "UTF-8")
df2$media<-"en_GBR_telegr"


# transform in data.table format
df<-rbind(df1,df2)
rm(df1,df2)


# select column of interest
df$id <- df$stories_id
df$who <- df$media
df$when <- as.Date(df$publish_date)
df$text <- df$title
df<-df[,c("id","who","when","text")]
df<-df[order(when),]

# select period of interest
mintime<-as.Date("2019-01-01")
maxtime<-as.Date("2020-12-31")
df<-df[(is.na(df$when)==F),] 
df<-df[as.Date(df$when) >= mintime,]
df<-df[as.Date(df$when) <= maxtime,]

# eliminate duplicate
df<-df[duplicated(df$text)==F,]

Check of time frequency

Time divisions

We transform the previous data.frame in a data.table format for easier operations of aggregation

dt<-as.data.table(df)
dt$day     <- as.Date(dt$when)
dt$week    <- cut(dt$day, "weeks", start.on.monday=TRUE)
dt$month   <- cut(dt$day, "months")
dt$weekday <- weekdays(dt$day)

# Save data frame
saveRDS(dt,"data/corpus/dt_mycorpus.RDS") 

News by week

We examine if the distribution is regular by week for the different media of the corpus.

dt<-readRDS("data/corpus/dt_mycorpus.RDS")
news_weeks<-dt[,.(newstot=.N),by=.(week,who)]

p<-ggplot(news_weeks, aes(x=as.Date(week),y=newstot, col=who))+
   geom_line()+
   geom_smooth(method = 'loess', formula = 'y~x')+
   scale_y_continuous("Number of news", limits = c(0,NA)) +
   scale_x_date("Week (starting on monday)") +
         ggtitle(label ="Corpus : distribution of news by week")
p

News by weekday

We examine if the distribution is regular by weekday and check in particular the effect of the week-end.

#compute frequencies by weekday
news_weekdays<-dt[,.(newstot=.N),by=.(weekday,who)]
news_weekdays<-news_weekdays[,.(weekday,newspct=100*newstot/sum(newstot)),by=.(who)]


# Translate weekdays in english and order
news_weekdays$weekday<-as.factor(news_weekdays$weekday)
levels(news_weekdays$weekday)<-c("7.Sunday","4.Wednesday","1.Monday","2.Tuesday","3.Thursday","6.Sathurday","5.Friday")
news_weekdays$weekday<-as.factor(as.character(news_weekdays$weekday))
news_weekdays<-news_weekdays[order(news_weekdays$weekday),]


p<-ggplot(news_weekdays, aes(x=weekday,fill = who, y=newspct))+
         geom_bar(position = "dodge", stat="identity")+
         scale_y_continuous("Share of news (%)", limits = c(0,NA)) +
         ggtitle(label ="Corpus : distribution of news by week day")
p

Transform in quanteda corpus


# transform in quanteda
qd<-corpus(dt,docid_field = "id",text_field = "text")


# filter by number of tokens by sentence
qd$nbt<-ntoken(texts(qd))
qd<-corpus_subset(qd, nbt<100)
qd<-corpus_subset(qd, nbt>2)



# Save corpus in qd format
saveRDS(qd,"data/corpus/qd_mycorpus.RDS")



head(qd)
Corpus consisting of 6 documents and 7 docvars.
1128777462 :
"Apprehension on all sides before launch of Irish abortion se..."

1128777446 :
"Toxic legacy taints ANC as it nears 25-year rule in South Af..."

1128795170 :
"Preparations for the Harbin ice and snow festival – in pictu..."

1128795156 :
"'Resign from Facebook': experts offer Mark Zuckerberg advice..."

1128795141 :
"Gender pay gap: 2018 brought transparency – will 2019 bring ..."

1128795146 :
"It's a thong thing: why £16 flip-flops are the shoe of summe..."
summary(qd,3)
Corpus consisting of 103386 documents, showing 3 documents:

       Text Types Tokens Sentences           who       when        day       week      month
 1128777462    10     10         1 en_GBR_guardi 2019-01-01 2019-01-01 2018-12-31 2019-01-01
 1128777446    12     12         1 en_GBR_guardi 2019-01-01 2019-01-01 2018-12-31 2019-01-01
 1128795170    11     11         1 en_GBR_guardi 2019-01-01 2019-01-01 2018-12-31 2019-01-01
 weekday nbt
   Mardi  10
   Mardi  12
   Mardi  11

Geographical tags

Preparation of data

Load dictonary

We start by loading the last version of the Imageun dictionary and we extract our target language (here : english).

# Load multilanguage dictionary
dict<-fread("data/dico_states/global_state_V2.csv")

# Select french dictionary
dict <- dict[dict$lang=="en",]


head(dict)

Load corpus

qd <- readRDS("data/corpus/qd_mycorpus.RDS")

Load tagging function

extract_tags <- function(qd = qd,                      # the corpus of interest
                         lang = "fr",                  # the language to be used
                         dict = dict,                  # the dictionary of target 
                         code = "ISO3" ,                # variable used for coding
                         alias = "x",                   # variable used for alias
                         tagsname = "states",           # name of the tags column
                         split  = c("'","’","-"),       # split list
                         tolow = TRUE  ,                # Tokenize text
                         comps = c("Afrique du sud")  # compounds
                         )
{ 


  
# Tokenize  
x<-as.character(qd)


if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       x <- gsub(reg," ",x)}  
if(tolow) { x <- tolower(x)} 
toks<-tokens(x)

# compounds
if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       comps<- gsub(reg," ",comps)}  
if(tolow)       {comps <- tolower(comps)}  
toks<-tokens_compound(toks,pattern=phrase(comps))

  
# Load dictionaries and create compounds

  ## Target dictionary

labels <-dict[[alias]]
if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       labels<- gsub(reg," ",labels)}  
if(tolow)       {labels <- tolower(labels)}  
toks<-tokens_compound(toks,pattern=phrase(labels))
  
 # create quanteda dictionary
keys <-gsub(" ","_",labels)
qd_dict<-as.list(keys)
names(qd_dict)<-dict[[code]]
qd_dict<-dictionary(qd_dict,tolower = FALSE)

# Identify geo tags (states or reg or org ...)
toks_tags <- tokens_lookup(toks, qd_dict, case_insensitive = F)
toks_tags <- lapply(toks_tags, unique)
toks_tags<-as.tokens(toks_tags)
list_tags<-function(x){res<-paste(x, collapse=' ')}
docvars(qd)[[tagsname]]<-as.character(lapply(toks_tags,FUN=list_tags))
docvars(qd)[[paste("nb",tagsname,sep="")]]<-ntoken(toks_tags)



# Export results
return(qd)
 }

Geographical annotation

Annotate all entities

t1<-Sys.time()

frcomps<-c("China sea", "indian ocean", "american continent", "american countries")

qd <- extract_tags (qd = qd,
                     lang="en",
                     dict = dict,
                     code = "ISO3",
                     alias = "x",
                     tagsname = "states",
                     split = c("'","’","-"),
                     comps = frcomps,
                     tolow = TRUE)

t2 = Sys.time()
paste("Program executed in ", t2-t1)

table(qd$nbstates)

check news with maximum state number

table(qd$nbstates)

    0     1     2     3     4     5 
76465 23942  2752   195    30     2 
check<-corpus_subset(qd,nbstates>3)
x<-data.frame(who=check$who,when = check$when,text=as.character(check),states=check$states,nbstates=check$nbstates)
x<-x[order(x$nbstates,decreasing = T),]
kable(x)
who when text states nbstates
1400389059 en_GBR_guardi 2019-09-23 France, Germany and UK blame Iran for Saudi oilfield attack FRA DEU GBR IRN SAU 5
1694147850 en_GBR_telegr 2020-08-27 travel news quarantine italy greece france switzerland uk restrictions ITA GRC FRA CHE GBR 5
1282834562 en_GBR_guardi 2019-05-14 No Iran threat in Syria or Iraq, top British officer says, contradicting US IRN SYR IRQ GBR 4
1313975713 en_GBR_guardi 2019-06-17 South Korea v Norway, Nigeria v France: Women’s World Cup clockwatch – live! KOR NOR NGA FRA 4
1315886333 en_GBR_guardi 2019-06-19 UK, France and Germany in last-ditch effort to save Iran deal GBR FRA DEU IRN 4
1384226853 en_GBR_guardi 2019-09-05 Euro 2020: Republic of Ireland v Switzerland, Romania v Spain – live! IRL CHE ROU ESP 4
1386655906 en_GBR_guardi 2019-09-08 Euro 2020 qualifying: Sweden v Norway, Finland v Italy and more – live! SWE NOR FIN ITA 4
1419597904 en_GBR_guardi 2019-10-15 Switzerland v Republic of Ireland, Sweden v Spain: Euro 2020 – live! CHE IRL SWE ESP 4
1517113825 en_GBR_telegr 2019-12-27 Iran, Russia and China carry out naval drills in Indian Ocean IRN RUS CHN IND 4
1517033930 en_GBR_telegr 2020-01-03 ‘Westerners should leave UAE immediately’: Gulf warning as British troops put at greater risk in Iraq after US kills Iran military chief ARE GBR IRQ IRN 4
1597961952 en_GBR_telegr 2020-03-31 Britain, France and Germany bypass US sanctions to provide Iran with medical aid GBR FRA DEU IRN 4
1682896357 en_GBR_telegr 2020-07-22 Australia joins US and Japan for navy drills in the Philippine Sea as concerns grow over China AUS JPN PHL CHN 4
1738190141 en_GBR_telegr 2020-07-26 France and Germany could join Spain on UK’s quarantine list FRA DEU ESP GBR 4
1671038069 en_GBR_telegr 2020-07-28 Chinese foreign ministry to suspend ‘mutual legal assistance’ in criminal matters between Hong Kong and the UK, Australia and Canada CHN GBR AUS CAN 4
1708910479 en_GBR_telegr 2020-08-06 UK quarantine imposed for travellers from Belgium, Andorra and the Bahamas GBR BEL AND BHS 4
1683099252 en_GBR_guardi 2020-08-14 Iran and Turkey denounce UAE over deal with Israel IRN TUR ARE ISR 4
1687591712 en_GBR_guardi 2020-08-19 Australian pilots drafted to help fly UK drones over Syria and Iraq AUS GBR SYR IRQ 4
1703180856 en_GBR_telegr 2020-08-25 Switzerland to have Covid quarantine re-imposed as Czech republic, Jamaica and Iceland enter danger zone CHE CZE JAM ISL 4
1695194722 en_GBR_telegr 2020-08-28 Travel news: Switzerland, Czech Republic and Jamaica out, but Cuba in following quarantine update CHE CZE JAM CUB 4
1707661716 en_GBR_guardi 2020-09-10 UK, France and Germany agree to reject US demand for Iran snapback sanctions GBR FRA DEU IRN 4
1719242458 en_GBR_guardi 2020-09-23 UK, France and Germany summon Iranian ambassadors over prisoners GBR FRA DEU IRN 4
1766046522 en_GBR_telegr 2020-09-27 UAE, Bahrain defend ties with Israel as Palestinians cry betrayal ARE BHR ISR PSE 4
1726187015 en_GBR_telegr 2020-10-01 Travel latest news: Italy, Greece, Sweden and Poland face the chop as quarantine decision looms ITA GRC SWE POL 4
1775693184 en_GBR_telegr 2020-10-01 Turkey and Poland put on quarantine list as Greece and Italy escape TUR POL GRC ITA 4
1737834173 en_GBR_guardi 2020-10-13 Nations League: Ukraine shock Spain, Germany and Switzerland draw thriller UKR ESP DEU CHE 4
1760080346 en_GBR_guardi 2020-11-04 Coronavirus live news: England to begin lockdown as Russia, Poland, Switzerland, Austria see record case rises RUS POL CHE AUT 4
1766818486 en_GBR_guardi 2020-11-11 Netherlands v Spain, Germany v Czech Republic and more: international football – live! NLD ESP DEU CZE 4
1767020618 en_GBR_guardi 2020-11-11 International friendly roundup: Finland stun France and Belgium edge Swiss FIN FRA BEL CHE 4
1772280029 en_GBR_guardi 2020-11-17 Spain v Germany, Croatia v Portugal and more: Nations League – live! ESP DEU HRV PRT 4
1777984778 en_GBR_guardi 2020-11-23 UK, France and Germany discuss working with Joe Biden on Iran nuclear deal GBR FRA DEU IRN 4
1784491580 en_GBR_guardi 2020-12-01 France and New Zealand join Australia’s criticism of Chinese government tweet FRA NZL AUS CHN 4
1802693210 en_GBR_guardi 2020-12-20 Belgium, Italy and Netherlands ban flights from UK over new Covid strain BEL ITA NLD GBR 4

Save geographically anotated corpus

saveRDS(qd,"data/corpus/qd_mycorpus_states.RDS")
paste("Size of resulting file = ",round(file.size("data/corpus/qd_mycorpus_states.RDS")/1000000,3), "Mo")
[1] "Size of resulting file =  5.721 Mo"

Thematic tags

qd <-readRDS("data/corpus/qd_mycorpus_states.RDS")

Load tagging function

extract_tags <- function(qd = qd,                      # the corpus of interest
                         lang = "fr",                  # the language to be used
                         dict = dict,                  # the dictionary of target 
                         code = "code" ,                # variable used for coding
                         alias = "alias",               # variable used for alias
                         tagsname = "states",           # name of the tags column
                         split  = c("'","’","-"),       # split list
                         tolow = TRUE  ,                # Tokenize text
                         comps = c("Afrique du sud")  # compounds
                         )
{ 


  
# Tokenize  
x<-as.character(qd)


if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       x <- gsub(reg," ",x)}  
if(tolow) { x <- tolower(x)} 
toks<-tokens(x)

# compounds
if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       comps<- gsub(reg," ",comps)}  
if(tolow)       {comps <- tolower(comps)}  
toks<-tokens_compound(toks,pattern=phrase(comps))

  
# Load dictionaries and create compounds

  ## Target dictionary

labels <-dict[[alias]]
if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       labels<- gsub(reg," ",labels)}  
if(tolow)       {labels <- tolower(labels)}  
toks<-tokens_compound(toks,pattern=phrase(labels))
  
 # create quanteda dictionary
keys <-gsub(" ","_",labels)
qd_dict<-as.list(keys)
names(qd_dict)<-dict[[code]]
qd_dict<-dictionary(qd_dict,tolower = FALSE)

# Identify geo tags (states or reg or org ...)
toks_tags <- tokens_lookup(toks, qd_dict, case_insensitive = F)
toks_tags <- lapply(toks_tags, unique)
toks_tags<-as.tokens(toks_tags)
list_tags<-function(x){res<-paste(x, collapse=' ')}
docvars(qd)[[tagsname]]<-as.character(lapply(toks_tags,FUN=list_tags))
docvars(qd)[[paste("nb",tagsname,sep="")]]<-ntoken(toks_tags)



# Export results
return(qd)
 }

The pandemic topic

Dictionary

We decide here to use lower case transformation. We use a star for the words that can take a plural form. We had of course covid and coronavirus

label <- c("pandemic*", "epidemic*", "virus", "world health organisation", "ebola",  "h1n1","sras", "chikungunya", "cholera", "flu", "covid*","coronavirus")
code  <- rep("pand", length(label))
lang  <- rep("en", length(label))
dict_pande <- data.frame(code,lang,label)
kable(dict_pande)
code lang label
pand en pandemic*
pand en epidemic*
pand en virus
pand en world health organisation
pand en ebola
pand en h1n1
pand en sras
pand en chikungunya
pand en cholera
pand en flu
pand en covid*
pand en coronavirus

encomps<-c("computer virus")

Annotation

qd <- extract_tags (qd = qd,
                     lang="en",
                     dict = dict_pande,
                     code = "code",
                     alias = "label",
                    tagsname = "pand",
                     split = c("'","’","-"),
                     comps = encomps,
                     tolow = TRUE)

table(qd$nbpand)

saveRDS(qd,"data/corpus/qd_mycorpus_states_topic.RDS")

Visualization

x<-data.table(docvars(qd))
x$tag<-x$nbpand !=0
tab<-x[,.(tot=.N),by=.(month,tag, who)]
tab<-tab[tab$tag==TRUE,]
tab$month<-as.Date(tab$month)

       
       p<-ggplot(tab, aes(x=month,fill =who, y=tot))+
         geom_bar(stat="identity")+
         ggtitle(label ="Pandemic : distribution of tags by month and media")
p

Hypercubes creation

Aggregation function


#' @title create an hypercube
#' @name hypercube
#' @description create a network of interlinked states
#' @param corpus a corpus of news in quanteda format
#' @param order an order of sentences in the news
#' @param who the source dimension
#' @param when the time dimension
#' @param timespan aggreation of time
#' @param what a list of topics
#' @param where1 a list of states
#' @param where2  a list of states


hypercube   <- function( corpus = qd,
                        order = "order",
                        who = "source",
                        when = "when",
                        timespan = "week",
                        what = "what",
                        where1 = "where1",
                        where2 = "where2")
{


  
# prepare data

  don<-docvars(corpus)
  
  df<-data.table(id     = docid(corpus),
                 order  = don[[order]],
                 who    = don[[who]],
                 when   = don[[when]],
                 what   = don[[what]],
                 where1 = don[[where1]],
                 where2 = don[[where2]])

  # adjust id
 df$id<-paste(df$id,"_",df$order,sep="")
 
# change time span
  df$when<-as.character(cut(as.Date(df$when), timespan, start.on.monday = TRUE))

# unnest where1
  df$where1[df$where1==""]<-"_no_"
  df<-unnest_tokens(df,where1,where1,to_lower=F)
  
# unnest where2
  df$where2[df$where2==""]<-"_no_"
  df<-unnest_tokens(df,where2,where2,to_lower=F) 
  
# unnest what
  df$what[df$what==""]<-"_no_"
  df<-unnest_tokens(df,what,what,to_lower=F) 
  


# Compute weight of news
  newswgt<-df[,list(wgt=1/.N),list(id)]
  df <- merge(df,newswgt, by="id")


# ------------------------ Hypercube creation --------------------#
  
  
# Aggregate
  hc<- df[,.(tags = .N, news=sum(wgt)) ,.(order,who, when,where1,where2, what)]
  
# Convert date to time
  hc$when<-as.Date(hc$when)
  
# export
  return(hc)
  
}

Pandemic hypercube

qd<-readRDS("data/corpus/qd_mycorpus_states_topic.RDS")
qd$order<-1

hc<-hypercube( corpus   = qd,
                    order    = "order",
                    who      = "who",
                    when     = "when",
                    timespan = "day",
                    what     = "pand",
                    where1   = "states",
                    where2   = "states")

saveRDS(hc,"data/corpus/hc_mycorpus_states_pand.RDS")
paste("Size of resulting file = ",round(file.size("data/corpus/hc_mycorpus_states_pand.RDS")/1000000,3), "Mo")
[1] "Size of resulting file =  0.133 Mo"

Hypercubes exploration

#### ---------------- testchi2 ----------------
#' @title  Compute the average salience of the topic and test significance of deviation
#' @name what
#' @description create a table and graphic of the topic
#' @param tabtest a table with variable trial, success and null.value
#' @param minsamp : Threshold of sample size requested for salience computation
#' @param mintest : Threshold of estimated value requested for chi-square test


testchi2<-function(tabtest=tabtest,
                   minsamp = 20,
                   mintest = 5) 
{
  tab<-tabtest
  n<-dim(tab)[1]
  
  # Compute salience if sample size sufficient (default : N>20)
  tab$estimate <-NA
  tab$salience <-NA
  tab$chi2<-NA
  tab$p.value<-NA
  if (tab$trial > minsamp){ tab$estimate<-round(tab$success/tab$trial,5)
  tab$salience<-tab$estimate/tab$null.value
  
  # Chi-square test if estimated value sufficient (default : Nij* > 5)
  
  for (i in 1:n) {
    if(tab$trial[i]*tab$null.value[i]>=mintest) {  
      test<-prop.test(x=tab$success[i],n=tab$trial[i], p=tab$null.value[i], 
                      alternative = "greater")
      tab$chi2[i]<-round(test$statistic,2)
      tab$p.value[i]<-round(test$p.value,5)
    } 
  }
  }
  return(tab)
}
hc <- readRDS("data/corpus/hc_mycorpus_states_pand.RDS")

Topic frequence (What ?)

Function

### ---------------- what ----------------
#' @title  Compute the average salience of the topic
#' @name what
#' @description create a table and graphic of the topic
#' @param hc an hypercube prepared as data.table
#' @param subtop a subtag of the main tag (default = NA)
#' @param title Title of the graphic


what <- function (hc = hypercube,
                  subtop = NA,
                  title = "What ?")
{
 
  
tab<-hc
if (is.na(subtop)){tab$what <-tab$what !="_no_"}else {tab$what <- tab$what == subtop}

tab<-tab[,list(news = sum(news)),by = what]
tab$pct<-100*tab$news/sum(tab$news)

p <- plot_ly(tab,
             labels = ~what,
             values = ~pct,
             type = 'pie') %>%
  layout(title = title,
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

output<-list("table" = tab, "plotly" =p)

return(output)

}

Example

res_what <- what(hc = hc,
             subtop = NA,
             title = "Topic news")
res_what$table
res_what$plotly

Topic variation by media (who.what)

The function who.what explore the variation of interest for the topic in the different media of the corpus.

Function


#### ---------------- who.what ----------------
#' @title  visualize variation of the topic between media
#' @name who.what
#' @description create a table of variation of the topic by media
#' @param hc an hypercube prepared as data.table
#' @param test : visualize test (TRUE) or salience (FALSE)
#' @param minsamp : Threshold of sample size requested for salience computation
#' @param mintest sample size of estimate for chi-square test (default = 5)
#' @param title Title of the graphic


who.what <- function (hc = hypercube,
                      test = FALSE,
                      minsamp = 20,
                      mintest = 5,
                      title = "Who says What ?")
{
  
  tab<-hc
  {tab$what <-tab$what !="_no_"}
  
  tab<-tab[,list(trial = sum(news),success=round(sum(news*what),0)),by = list(who)]
  ref <-round(sum(tab$success)/sum(tab$trial),4)
  tab$null.value<-ref
  
  tab<-testchi2(tabtest=tab,
                minsamp = minsamp,
                mintest = mintest)
  
  
  
  if (test==FALSE) {tab$index =tab$salience
  tab<-tab[tab$trial > minsamp,]
  mycol<-brewer.pal(7,"YlOrRd")
  } 
  else {tab$index=tab$p.value
  tab<-tab[tab$trial*tab$null.value>mintest,]
  mycol<-brewer.pal(7,"RdYlBu")
  mycol[4]<-"lightyellow"
  }
  
  p <- plot_ly(tab,
               x = ~who,
               y = ~estimate*100,
               color= ~index,
               colors= mycol,
               hoverinfo = "text",
               text = ~paste('Source: ',who,
                             '<br /> Total news  : ', round(trial,0),
                             '<br /> Topic news : ', round(success,0),
                             '<br /> % observed  : ', round(estimate*100,2),'%',
                             '<br /> % estimated : ', round(null.value*100,2),'%',
                             '<br /> Salience : ', round(salience,2),  
                             '<br /> p.value : ', round(p.value,4)),
               type = "bar")  %>%
    layout(title = title,
           yaxis = list(title = "% news"),
           barmode = 'stack')
  
  output<-list("table" = tab, "plotly" =p)
  
  return(output)
  
}

Example





res_who_what<- who.what(hc=hc, 
                        test = TRUE,
                        minsamp = 5,
                        mintest = 1,
                        title = "Topic news by media - Significance")
res_who_what$plotly
NA

Topic variation through time (when.what)

Function

#### ---------------- when.what ----------------
#' @title  visualize variation of the topic through time
#' @name when.what
#' @description create a table of variation of the topic by media
#' @param test : visualize test (TRUE) or salience (FALSE)
#' @param minsamp : Threshold of sample size requested for salience computation
#' @param mintest sample size of estimate for chi-square test (default = 5)
#' @param title Title of the graphic


when.what <- function (hc = hypercube,
                       test = FALSE,
                       minsamp = 20,
                       mintest = 5,
                       title = "Who says What ?")
{
  
  tab<-hc
  {tab$what <-tab$what !="_no_"}
  
  tab<-tab[,list(trial = sum(news),success=round(sum(news*what),0)),by = list(when)]
  ref <-round(sum(tab$success)/sum(tab$trial),4)
  tab$null.value<-ref
  
  tab<-testchi2(tabtest=tab,
                minsamp = minsamp,
                mintest = mintest)
  
  if (test==FALSE) {tab$index =tab$salience
  tab<-tab[tab$trial > minsamp,]
  mycol<-brewer.pal(7,"YlOrRd")
  } 
  else {tab$index=tab$p.value
  tab<-tab[tab$trial*tab$null.value>mintest,]
  mycol<-brewer.pal(7,"RdYlBu")
  mycol[4]<-"lightyellow"
  }
  
  
  p <- plot_ly(tab,
               x = ~as.character(when),
               y = ~estimate*100,
               color= ~index,
               colors= mycol,
               hoverinfo = "text",
               text = ~paste('Time: ',when,
                             '<br /> Total news  : ', round(trial,0),
                             '<br /> Topic news : ', round(success,0),
                             '<br /> % observed  : ', round(estimate*100,2),'%',
                             '<br /> % estimated : ', round(null.value*100,2),'%',
                             '<br /> Salience : ', round(salience,2),  
                             '<br /> p.value : ', round(p.value,4)),
               type = "bar")  %>%
    layout(title = title,
           yaxis = list(title = "% news"),
           barmode = 'stack')
  
  output<-list("table" = tab, "plotly" =p)
  
  return(output)
  
}

Example 1 : 2019-2020 by month

# Modify time period by month
hc2 <- hc %>% mutate(when = cut(when,breaks="month"))


res_when_what<- when.what(hc=hc2, 
                          test=TRUE,
                          minsamp=10,
                          mintest=5,
                          title = "Topic news by month - Significance")


res_when_what$plotly

Example 2 : 2020 by week

# Modify time period by month
hc2 <- hc %>% filter(substr(when,1,4)=="2020") %>% mutate(when = cut(when,breaks="week"))


res_when_what<- when.what(hc=hc2, 
                          test=TRUE,
                          minsamp=10,
                          mintest=5,
                          title = "Topic news by week - Significance")


res_when_what$plotly

Example 3 : Jan-March 2020 by day

# Modify time period by month
hc2 <- hc %>% filter(when > as.Date("2020-01-01"), when < as.Date("2020-04-01")) 


res_when_what<- when.what(hc=hc2, 
                          test=TRUE,
                          minsamp=10,
                          mintest=5,
                          title = "Topic news by day - Significance")


res_when_what$plotly

Topic variation through space (where.what)

Function


#### ---------------- where.what ----------------
#' @title  visualize spatialization of the topic 
#' @name where.what
#' @description create a table of variation of the topic by media
#' @param hc an hypercube prepared as data.table
#' @param test : visualize test (TRUE) or salience (FALSE)
#' @param minsamp : Threshold of sample size requested for salience computation
#' @param mintest sample size of estimate for chi-square test (default = 5)
#' @param map a map with coordinates in lat-long
#' @param proj a projection accepted by plotly
#' @param title Title of the graphic


where.what <- function (hc = hypercube,
                        test = FALSE,
                        minsamp = 20,
                        mintest = 5,
                        map = world_ctr,
                        proj = 'azimuthal equal area',
                        title = "Where said What ?")
{
  
  tab<-hc
  tab$what <-tab$what !="_no_"
  
  tab<-tab[,list(trial = round(sum(news),0),success=round(sum(news*what),0)),by = list(where1)]
  ref <-round(sum(tab$success)/sum(tab$trial),4)
  tab$null.value<-ref
  
  tab<-testchi2(tabtest=tab,
                minsamp = minsamp,
                mintest = mintest)
  
  
  
  tab<-tab[order(-chi2),]
  
  
  
  if (test==FALSE) {tab$index =tab$salience
  tab<-tab[tab$trial > minsamp,]
  mycol<-brewer.pal(7,"YlOrRd")
  } 
  else {tab$index=tab$p.value
  tab<-tab[tab$trial*tab$null.value>mintest,]
  mycol<-brewer.pal(7,"RdYlBu")
  mycol[4]<-"lightyellow"
  }
  
  
  map<-merge(map,tab,all.x=T,all.y=F,by.x="ISO3",by.y="where1")
  
  
  
  #map2<-map[is.na(map$pct)==F,]
  #map2<-st_centroid(map2)
  #map2<-st_drop_geometry(map2)
  
  
  g <- list(showframe = TRUE,
            framecolor= toRGB("gray20"),
            coastlinecolor = toRGB("gray20"),
            showland = TRUE,
            landcolor = toRGB("gray50"),
            showcountries = TRUE,
            countrycolor = toRGB("white"),
            countrywidth = 0.2,
            projection = list(type = proj))
  
  
  
  p<- plot_geo(map)%>%
    add_markers(x = ~lon,
                y = ~lat,
                sizes = c(0, 250),
                size = ~success,
                #             color= ~signif,
                color = ~index,
                colors= mycol,
                hoverinfo = "text",
                text = ~paste('Location: ',NAME,
                              '<br /> Total news  : ', round(trial,0),
                              '<br /> Topic news : ', round(success,0),
                              '<br /> % observed  : ', round(estimate*100,2),'%',
                              '<br /> % estimated : ', round(null.value*100,2),'%',
                              '<br /> Salience : ', round(salience,2),  
                              '<br /> p.value : ', round(p.value,4))) %>%
    
    layout(geo = g,
           title = title)
  
  
  
  output<-list("table" = tab, "plotly" =p)
  
  return(output)
  
}

Example

map<-readRDS("data/dico_states/world_ctr_4326.Rdata")
hc2<-hc %>% filter(where1 !="_no_", where2 !="_no_") %>% filter(where1 !="GBR", where2 !="GBR")

res_where_what<- where.what(hc=hc2,
                            test=TRUE,
                            minsamp=10,
                            map = map, 
                            mintest =2,
                            title = "Topic news by states - Significance")
res_where_what$plotly

Spatial Networks

Geo networks modelisation

Hypercube Filter (function)

hc_filter <- function(don = hc,
                      who = "who",
                      when = "when",
                      where1 = "where1",
                      where2 = "where2",
                      wgt = "tags",
                      self = FALSE,
                      when_start = NA,
                      when_end = NA,
                      who_exc = NA,
                      who_inc = NA,
                      where1_exc = NA,
                      where1_inc = NA,
                      where2_exc = NA,
                      where2_inc = NA)

  {                          
  
    df<-data.table(who = don[[who]],
                   when = don[[when]],
                   where1 = don[[where1]],
                   where2 = don[[where2]],
                   wgt = don[[wgt]])
    
    # Select time period
        if (is.na(when_start)==FALSE) { 
        df <- df[when >= as.Date(when_start), ]}
        if (is.na(when_end)==FALSE) { 
        df <- df[when <= as.Date(when_end), ]}
    # Select who
        if (is.na(who_exc)==FALSE) { 
        df <- df[!(who %in% who_exc), ]}
        if (is.na(who_inc)==FALSE) { 
        df <- df[(who %in% who_inc), ]}
    # Select where1
        if (is.na(where1_exc)==FALSE) { 
        df <- df[!(where1 %in% where1_exc), ]}
        if (is.na(where1_inc)==FALSE) { 
        df <- df[(where1 %in% where1_inc), ]}
    # Select where2
        if (is.na(where2_exc)==FALSE) { 
        df <- df[!(where2 %in% where2_exc), ]}
        if (is.na(where2_inc)==FALSE) { 
        df <- df[(where2 %in% where2_inc), ]}
    # eliminate internal links
       if (self==FALSE) { 
        df <- df[(where1 != where2), ]}
    return(df)
  
}

Matrix builder (function)

build_int <- function(don = don,       # a dataframe with columns i, j , Fij
                      i = "where1",
                      j = "where2",
                      Fij = "wgt",
                      s1 = 10,
                      s2 = 10,
                      n1 = 2,
                      n2 = 2,
                      k = 1)

{  
  df<-data.table(i=don[[i]],j=don[[j]],Fij=don[[Fij]])
  int <-df[,.(Fij=sum(Fij)),.(i,j)]
  int<-dcast(int,formula = i~j,fill = 0)
  mat<-as.matrix(int[,-1])
  row.names(mat)<-int$i
  mat<-mat[apply(mat,1,sum)>=s1,apply(mat,2,sum)>=s2 ]
  m0<-mat
  m0[m0<k]<-0
  m0[m0>=k]<-1
  mat<-mat[apply(m0,1,sum)>=n1,apply(m0,2,sum)>=n2 ]
  int<-reshape2::melt(mat)
  names(int) <-c("i","j","Fij")
  return(int)
}

Random model (function)


rand_int <- function(int = int, # A table with columns i, j Fij
                     maxsize = 100000,
                     diag    = FALSE,
                     resid   = FALSE) {
    # Eliminate diagonal ?
    if (diag==FALSE) { 
        int <- int[as.character(int$i) != as.character(int$j), ]}
  
    # Compute model if size not too large
    if (dim(int)[1] < maxsize) {
       # Proceed to poisson regression model
       mod <- glm( formula = Fij ~ i + j,family = "poisson", data = int)
  
       # Add residuals if requested
       if(resid == TRUE)   { 
          # Add estimates
          int$Eij <- mod$fitted.values

          # Add absolute residuals
          int$Rabs_ij <- int$Fij-int$Eij

          # Add relative residuals
          int$Rrel_ij <- int$Fij/int$Eij

          # Add chi-square residuals
          int$Rchi_ij <-  (int$Rabs_ij)**2 / int$Eij
          int$Rchi_ij[int$Rabs_ij<0]<- -int$Rchi_ij[int$Rabs_ij<0]
          }
         
    } else { paste ("Table > 100000 -  \n 
                     modify maxsize =  parameter \n
                     if you are sure that your computer can do it !")}
  # Export results
  int$i<-as.character(int$i)
  int$j<-as.character(int$j)
  return(int)
  
 }

Visualize network (function)

geo_network<- function(don = don,
                       from = "i",
                        to = "j", 
                        size = "Fij",
                        minsize = 1,
                        maxsize = NA,
                        test = "Fij",
                        mintest = 1,
                        loops  = FALSE, 
                        title = "Network")

{
int<-data.frame(i = as.character(don[,from]),
                j = as.character(don[,to]),
                size = don[,size],
                test = don[,test]
                )
if (is.na(minsize)==FALSE) {int =int[int$size >= minsize,]} 
if (is.na(maxsize)==FALSE) {int =int[int$size <= maxsize,]} 
if (is.na(mintest)==FALSE) {int =int[int$test >= mintest,]}

nodes<-data.frame(code = unique(c(int$i,int$j)))
nodes$code<-as.character(nodes$code)
nodes$id<-1:length(nodes$code)
nodes$label<-nodes$code
nodes$color <-"gray"
nodes$color[nodes$code %in% int$j]<-"red"


# Adjust edge codes
edges <- int %>% mutate(width = 5+5*size / max(size)) %>%
                left_join(nodes %>% select(i=code, from = id)) %>%  
                left_join(nodes %>% select(j=code, to = id )) 

# compute nodesize
toti<-int %>% group_by(i) %>% summarize(size =sum(size)) %>% select (code=i,size)
totj<-int %>% group_by(j) %>% summarize(size =sum(size)) %>% select (code=j,size)
tot<-rbind(toti,totj)
tot<-unique(tot)
tot$code<-as.factor(tot$code)
nodes <- left_join(nodes,tot) %>% mutate(value = 1 +5*sqrt(size/max(size)))


#sel_nodes <-nodes %>% filter(code %in% unique(c(sel_edges$i,sel_edges$j)))

# eliminate loops

if(loops == FALSE) {edges <- edges[edges$from < edges$to,]}

net<- visNetwork(nodes, 
                  edges, 
                  main = title,
height = "1000px", 
                  width = "70%")   %>%   
   visNodes(scaling =list(min =20, max=60, 
                          label=list(min=20,max=80, 
                                    maxVisible = 20)))%>%
  visEdges(scaling = list(min=20,max=60))%>%
       visOptions(highlightNearest = TRUE,
     #               selectedBy = "group", 
    #               manipulation = TRUE,
                  nodesIdSelection = TRUE) %>%
        visInteraction(navigationButtons = TRUE) %>%
         visLegend() %>%
      visIgraphLayout(layout ="layout.fruchterman.reingold",smooth = TRUE)

net
 return(net)
 } 

Application

# Load complete hypercube
hc <- readRDS("data/corpus/hc_mycorpus_states_pand.RDS")
hc <- hc %>% filter(where1 !="_no_", where2 != "_no_")

# Eliminate non foreign news 
hc<-hc[hc$where1 != substr(who,4,6),]
hc<-hc[hc$where2 != substr(who,4,6),]

# Add complete labels
map<-readRDS("data/dico_states/world_map_4326.Rdata")
labs<-st_drop_geometry(map)
labs<-labs[,c(1,4)]

# Shorten the name of USA
labs$NAME[labs$ISO3=="USA"]<-"U.S.A."


names(labs)<-c("where1","geofr1")
hc<-left_join(hc,labs)
Joining, by = "where1"
names(labs)<-c("where2","geofr2")
hc<-left_join(hc,labs)
Joining, by = "where2"
hc_geo_geo <- hc

Reference network

hc<-hc_geo_geo 



hc<-hc_filter(don = hc,
                             wgt = "news",
                             where1 = "geofr1",
                             where2 = "geofr2",
                             where1_exc = c("_no_"),
                             where2_exc = c("_no_"),
                             self = FALSE
                           )

int <- build_int(don = hc,
                 s1=2,
                 s2=2,
                 n1=1,
                 n2=1,
                 k=0)
Using 'Fij' as value column. Use 'value.var' to override
mod<-rand_int(int,
              resid = TRUE,
              diag = FALSE)


network<- geo_network(mod,
                      size = "Fij",
                      minsize = 1,
                      test = "Rchi_ij",
                      mintest = 3.84)
Joining, by = "i"
Joining, by = "j"
`summarise()` ungrouping output (override with `.groups` argument)
`summarise()` ungrouping output (override with `.groups` argument)
Joining, by = "code"
network
NA
---
title: "Geographical analysis of media"
subtitle: "Application "
author: "Claude Grasland"
output: html_notebook
---

```{r setup, include=FALSE}
library(knitr)

## Global options
options(max.print="75")
opts_chunk$set(echo=FALSE,
	             cache=FALSE,
               prompt=FALSE,
               tidy=FALSE,
               comment=NA,
               message=FALSE,
               warning=FALSE)


# Basic packages
library(dplyr)
library(data.table)
library(lubridate)

# Graphic packages
library(ggplot2)
library(plotly)
library(RColorBrewer)
library(visNetwork)

# Spatial packages
library(sf)

# text mining packages
library(quanteda)
library(quanteda.textplots)
library(tidytext)
library(readtext)
library(stringr)
library(readr)
```




# Corpus preparation

We use here data from Media cloud with only title of news. Therfore the program is more simple than in course.

## Selection of media


```{r corp_fr, eval=FALSE}

# Load data with the function fread (fast) and the encoding UTF-8
df1<-fread("data/source/en_GBR_guardi_2019_2020.csv", encoding = "UTF-8")
df1$media<-"en_GBR_guardi"


df2<-fread("data/source/en_GBR_telegr_2019_2020.csv", encoding = "UTF-8")
df2$media<-"en_GBR_telegr"


# transform in data.table format
df<-rbind(df1,df2)
rm(df1,df2)


# select column of interest
df$id <- df$stories_id
df$who <- df$media
df$when <- as.Date(df$publish_date)
df$text <- df$title
df<-df[,c("id","who","when","text")]
df<-df[order(when),]

# select period of interest
mintime<-as.Date("2019-01-01")
maxtime<-as.Date("2020-12-31")
df<-df[(is.na(df$when)==F),] 
df<-df[as.Date(df$when) >= mintime,]
df<-df[as.Date(df$when) <= maxtime,]

# eliminate duplicate
df<-df[duplicated(df$text)==F,]



```


## Check of time frequency


### Time divisions

We transform the previous data.frame in a data.table format for easier operations of aggregation 

```{r time div, eval = FALSE}
dt<-as.data.table(df)
dt$day     <- as.Date(dt$when)
dt$week    <- cut(dt$day, "weeks", start.on.monday=TRUE)
dt$month   <- cut(dt$day, "months")
dt$weekday <- weekdays(dt$day)

# Save data frame
saveRDS(dt,"data/corpus/dt_mycorpus.RDS") 
```


### News by week




We examine if the distribution is regular by week for the different media of the corpus.

```{r news_week_fr}
dt<-readRDS("data/corpus/dt_mycorpus.RDS")
news_weeks<-dt[,.(newstot=.N),by=.(week,who)]

p<-ggplot(news_weeks, aes(x=as.Date(week),y=newstot, col=who))+
   geom_line()+
   geom_smooth(method = 'loess', formula = 'y~x')+
   scale_y_continuous("Number of news", limits = c(0,NA)) +
   scale_x_date("Week (starting on monday)") +
         ggtitle(label ="Corpus : distribution of news by week")
p
```

### News by weekday

We examine if the distribution is regular by weekday and check in particular the effect of the week-end.

```{r news_weekdays_fr}
#compute frequencies by weekday
news_weekdays<-dt[,.(newstot=.N),by=.(weekday,who)]
news_weekdays<-news_weekdays[,.(weekday,newspct=100*newstot/sum(newstot)),by=.(who)]


# Translate weekdays in english and order
news_weekdays$weekday<-as.factor(news_weekdays$weekday)
levels(news_weekdays$weekday)<-c("7.Sunday","4.Wednesday","1.Monday","2.Tuesday","3.Thursday","6.Sathurday","5.Friday")
news_weekdays$weekday<-as.factor(as.character(news_weekdays$weekday))
news_weekdays<-news_weekdays[order(news_weekdays$weekday),]


p<-ggplot(news_weekdays, aes(x=weekday,fill = who, y=newspct))+
         geom_bar(position = "dodge", stat="identity")+
         scale_y_continuous("Share of news (%)", limits = c(0,NA)) +
         ggtitle(label ="Corpus : distribution of news by week day")
p
```


## Transform in quanteda corpus



```{r sent_fr}

# transform in quanteda
qd<-corpus(dt,docid_field = "id",text_field = "text")


# filter by number of tokens by sentence
qd$nbt<-ntoken(texts(qd))
qd<-corpus_subset(qd, nbt<100)
qd<-corpus_subset(qd, nbt>2)



# Save corpus in qd format
saveRDS(qd,"data/corpus/qd_mycorpus.RDS")



head(qd)
summary(qd,3)
```





# Geographical tags


## Preparation of data


### Load dictonary

We start by loading the last version of the Imageun dictionary and we extract our target language (here : english).

```{r load_dict}
# Load multilanguage dictionary
dict<-fread("data/dico_states/global_state_V2.csv")

# Select french dictionary
dict <- dict[dict$lang=="en",]


head(dict)
```

### Load corpus

```{r}
qd <- readRDS("data/corpus/qd_mycorpus.RDS")
```

### Load tagging function

```{r func_annotate}
extract_tags <- function(qd = qd,                      # the corpus of interest
                         lang = "fr",                  # the language to be used
                         dict = dict,                  # the dictionary of target 
                         code = "ISO3" ,                # variable used for coding
                         alias = "x",                   # variable used for alias
                         tagsname = "states",           # name of the tags column
                         split  = c("'","’","-"),       # split list
                         tolow = TRUE  ,                # Tokenize text
                         comps = c("Afrique du sud")  # compounds
                         )
{ 


  
# Tokenize  
x<-as.character(qd)


if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       x <- gsub(reg," ",x)}  
if(tolow) { x <- tolower(x)} 
toks<-tokens(x)

# compounds
if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       comps<- gsub(reg," ",comps)}  
if(tolow)       {comps <- tolower(comps)}  
toks<-tokens_compound(toks,pattern=phrase(comps))

  
# Load dictionaries and create compounds

  ## Target dictionary

labels <-dict[[alias]]
if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       labels<- gsub(reg," ",labels)}  
if(tolow)       {labels <- tolower(labels)}  
toks<-tokens_compound(toks,pattern=phrase(labels))
  
 # create quanteda dictionary
keys <-gsub(" ","_",labels)
qd_dict<-as.list(keys)
names(qd_dict)<-dict[[code]]
qd_dict<-dictionary(qd_dict,tolower = FALSE)

# Identify geo tags (states or reg or org ...)
toks_tags <- tokens_lookup(toks, qd_dict, case_insensitive = F)
toks_tags <- lapply(toks_tags, unique)
toks_tags<-as.tokens(toks_tags)
list_tags<-function(x){res<-paste(x, collapse=' ')}
docvars(qd)[[tagsname]]<-as.character(lapply(toks_tags,FUN=list_tags))
docvars(qd)[[paste("nb",tagsname,sep="")]]<-ntoken(toks_tags)



# Export results
return(qd)
 }
```



## Geographical annotation

### Annotate all entities


```{r annotate, eval=FALSE}



t1<-Sys.time()

frcomps<-c("China sea", "indian ocean", "american continent", "american countries")

qd <- extract_tags (qd = qd,
                     lang="en",
                     dict = dict,
                     code = "ISO3",
                     alias = "x",
                     tagsname = "states",
                     split = c("'","’","-"),
                     comps = frcomps,
                     tolow = TRUE)

t2 = Sys.time()
paste("Program executed in ", t2-t1)

table(qd$nbstates)


```



### check news with maximum state number

```{r check_states_news}
table(qd$nbstates)
check<-corpus_subset(qd,nbstates>3)
x<-data.frame(who=check$who,when = check$when,text=as.character(check),states=check$states,nbstates=check$nbstates)
x<-x[order(x$nbstates,decreasing = T),]
kable(x)
```







### Save geographically anotated corpus

```{r}
saveRDS(qd,"data/corpus/qd_mycorpus_states.RDS")
paste("Size of resulting file = ",round(file.size("data/corpus/qd_mycorpus_states.RDS")/1000000,3), "Mo")
```




# Thematic tags



```{r}
qd <-readRDS("data/corpus/qd_mycorpus_states.RDS")
```



## Load tagging function


```{r func_annotate2}
extract_tags <- function(qd = qd,                      # the corpus of interest
                         lang = "fr",                  # the language to be used
                         dict = dict,                  # the dictionary of target 
                         code = "code" ,                # variable used for coding
                         alias = "alias",               # variable used for alias
                         tagsname = "states",           # name of the tags column
                         split  = c("'","’","-"),       # split list
                         tolow = TRUE  ,                # Tokenize text
                         comps = c("Afrique du sud")  # compounds
                         )
{ 


  
# Tokenize  
x<-as.character(qd)


if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       x <- gsub(reg," ",x)}  
if(tolow) { x <- tolower(x)} 
toks<-tokens(x)

# compounds
if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       comps<- gsub(reg," ",comps)}  
if(tolow)       {comps <- tolower(comps)}  
toks<-tokens_compound(toks,pattern=phrase(comps))

  
# Load dictionaries and create compounds

  ## Target dictionary

labels <-dict[[alias]]
if(length(split) > 0) { reg<-paste(split, collapse = '|')
                       labels<- gsub(reg," ",labels)}  
if(tolow)       {labels <- tolower(labels)}  
toks<-tokens_compound(toks,pattern=phrase(labels))
  
 # create quanteda dictionary
keys <-gsub(" ","_",labels)
qd_dict<-as.list(keys)
names(qd_dict)<-dict[[code]]
qd_dict<-dictionary(qd_dict,tolower = FALSE)

# Identify geo tags (states or reg or org ...)
toks_tags <- tokens_lookup(toks, qd_dict, case_insensitive = F)
toks_tags <- lapply(toks_tags, unique)
toks_tags<-as.tokens(toks_tags)
list_tags<-function(x){res<-paste(x, collapse=' ')}
docvars(qd)[[tagsname]]<-as.character(lapply(toks_tags,FUN=list_tags))
docvars(qd)[[paste("nb",tagsname,sep="")]]<-ntoken(toks_tags)



# Export results
return(qd)
 }
```


## The pandemic topic

### Dictionary

We decide here to use lower case transformation. We use a star for the words that can take a plural form. We had of course covid and coronavirus 

```{r dico pandemic}
label <- c("pandemic*", "epidemic*", "virus", "world health organisation", "ebola",  "h1n1","sras", "chikungunya", "cholera", "flu", "covid*","coronavirus")
code  <- rep("pand", length(label))
lang  <- rep("en", length(label))
dict_pande <- data.frame(code,lang,label)
kable(dict_pande)

encomps<-c("computer virus")
```


### Annotation

```{r annotate pandemic, eval=FALSE}

qd <- extract_tags (qd = qd,
                     lang="en",
                     dict = dict_pande,
                     code = "code",
                     alias = "label",
                    tagsname = "pand",
                     split = c("'","’","-"),
                     comps = encomps,
                     tolow = TRUE)

table(qd$nbpand)

saveRDS(qd,"data/corpus/qd_mycorpus_states_topic.RDS")

```


### Visualization


```{r visualization pandemic}
x<-data.table(docvars(qd))
x$tag<-x$nbpand !=0
tab<-x[,.(tot=.N),by=.(month,tag, who)]
tab<-tab[tab$tag==TRUE,]
tab$month<-as.Date(tab$month)

       
       p<-ggplot(tab, aes(x=month,fill =who, y=tot))+
         geom_bar(stat="identity")+
         ggtitle(label ="Pandemic : distribution of tags by month and media")
p
```




# Hypercubes creation



## Aggregation function


```{r}

#' @title create an hypercube
#' @name hypercube
#' @description create a network of interlinked states
#' @param corpus a corpus of news in quanteda format
#' @param order an order of sentences in the news
#' @param who the source dimension
#' @param when the time dimension
#' @param timespan aggreation of time
#' @param what a list of topics
#' @param where1 a list of states
#' @param where2  a list of states


hypercube   <- function( corpus = qd,
                        order = "order",
                        who = "source",
                        when = "when",
                        timespan = "week",
                        what = "what",
                        where1 = "where1",
                        where2 = "where2")
{


  
# prepare data

  don<-docvars(corpus)
  
  df<-data.table(id     = docid(corpus),
                 order  = don[[order]],
                 who    = don[[who]],
                 when   = don[[when]],
                 what   = don[[what]],
                 where1 = don[[where1]],
                 where2 = don[[where2]])

  # adjust id
 df$id<-paste(df$id,"_",df$order,sep="")
 
# change time span
  df$when<-as.character(cut(as.Date(df$when), timespan, start.on.monday = TRUE))

# unnest where1
  df$where1[df$where1==""]<-"_no_"
  df<-unnest_tokens(df,where1,where1,to_lower=F)
  
# unnest where2
  df$where2[df$where2==""]<-"_no_"
  df<-unnest_tokens(df,where2,where2,to_lower=F) 
  
# unnest what
  df$what[df$what==""]<-"_no_"
  df<-unnest_tokens(df,what,what,to_lower=F) 
  


# Compute weight of news
  newswgt<-df[,list(wgt=1/.N),list(id)]
  df <- merge(df,newswgt, by="id")


# ------------------------ Hypercube creation --------------------#
  
  
# Aggregate
  hc<- df[,.(tags = .N, news=sum(wgt)) ,.(order,who, when,where1,where2, what)]
  
# Convert date to time
  hc$when<-as.Date(hc$when)
  
# export
  return(hc)
  
}

```


## Pandemic hypercube

```{r}
qd<-readRDS("data/corpus/qd_mycorpus_states_topic.RDS")
qd$order<-1

hc<-hypercube( corpus   = qd,
                    order    = "order",
                    who      = "who",
                    when     = "when",
                    timespan = "day",
                    what     = "pand",
                    where1   = "states",
                    where2   = "states")

saveRDS(hc,"data/corpus/hc_mycorpus_states_pand.RDS")
paste("Size of resulting file = ",round(file.size("data/corpus/hc_mycorpus_states_pand.RDS")/1000000,3), "Mo")
```





# Hypercubes exploration





```{r}
#### ---------------- testchi2 ----------------
#' @title  Compute the average salience of the topic and test significance of deviation
#' @name what
#' @description create a table and graphic of the topic
#' @param tabtest a table with variable trial, success and null.value
#' @param minsamp : Threshold of sample size requested for salience computation
#' @param mintest : Threshold of estimated value requested for chi-square test


testchi2<-function(tabtest=tabtest,
                   minsamp = 20,
                   mintest = 5) 
{
  tab<-tabtest
  n<-dim(tab)[1]
  
  # Compute salience if sample size sufficient (default : N>20)
  tab$estimate <-NA
  tab$salience <-NA
  tab$chi2<-NA
  tab$p.value<-NA
  if (tab$trial > minsamp){ tab$estimate<-round(tab$success/tab$trial,5)
  tab$salience<-tab$estimate/tab$null.value
  
  # Chi-square test if estimated value sufficient (default : Nij* > 5)
  
  for (i in 1:n) {
    if(tab$trial[i]*tab$null.value[i]>=mintest) {  
      test<-prop.test(x=tab$success[i],n=tab$trial[i], p=tab$null.value[i], 
                      alternative = "greater")
      tab$chi2[i]<-round(test$statistic,2)
      tab$p.value[i]<-round(test$p.value,5)
    } 
  }
  }
  return(tab)
}

```




```{r}
hc <- readRDS("data/corpus/hc_mycorpus_states_pand.RDS")
```


## Topic frequence (What ?) 


### Function

```{r}
### ---------------- what ----------------
#' @title  Compute the average salience of the topic
#' @name what
#' @description create a table and graphic of the topic
#' @param hc an hypercube prepared as data.table
#' @param subtop a subtag of the main tag (default = NA)
#' @param title Title of the graphic


what <- function (hc = hypercube,
                  subtop = NA,
                  title = "What ?")
{
 
  
tab<-hc
if (is.na(subtop)){tab$what <-tab$what !="_no_"}else {tab$what <- tab$what == subtop}

tab<-tab[,list(news = sum(news)),by = what]
tab$pct<-100*tab$news/sum(tab$news)

p <- plot_ly(tab,
             labels = ~what,
             values = ~pct,
             type = 'pie') %>%
  layout(title = title,
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

output<-list("table" = tab, "plotly" =p)

return(output)

}
```


### Example

```{r what example ,warning = FALSE, message = FALSE}
res_what <- what(hc = hc,
             subtop = NA,
             title = "Topic news")
res_what$table
res_what$plotly
```











## Topic variation by media (who.what)

The function who.what explore the variation of interest for the topic in the different media of the corpus.

### Function

```{r}

#### ---------------- who.what ----------------
#' @title  visualize variation of the topic between media
#' @name who.what
#' @description create a table of variation of the topic by media
#' @param hc an hypercube prepared as data.table
#' @param test : visualize test (TRUE) or salience (FALSE)
#' @param minsamp : Threshold of sample size requested for salience computation
#' @param mintest sample size of estimate for chi-square test (default = 5)
#' @param title Title of the graphic


who.what <- function (hc = hypercube,
                      test = FALSE,
                      minsamp = 20,
                      mintest = 5,
                      title = "Who says What ?")
{
  
  tab<-hc
  {tab$what <-tab$what !="_no_"}
  
  tab<-tab[,list(trial = sum(news),success=round(sum(news*what),0)),by = list(who)]
  ref <-round(sum(tab$success)/sum(tab$trial),4)
  tab$null.value<-ref
  
  tab<-testchi2(tabtest=tab,
                minsamp = minsamp,
                mintest = mintest)
  
  
  
  if (test==FALSE) {tab$index =tab$salience
  tab<-tab[tab$trial > minsamp,]
  mycol<-brewer.pal(7,"YlOrRd")
  } 
  else {tab$index=tab$p.value
  tab<-tab[tab$trial*tab$null.value>mintest,]
  mycol<-brewer.pal(7,"RdYlBu")
  mycol[4]<-"lightyellow"
  }
  
  p <- plot_ly(tab,
               x = ~who,
               y = ~estimate*100,
               color= ~index,
               colors= mycol,
               hoverinfo = "text",
               text = ~paste('Source: ',who,
                             '<br /> Total news  : ', round(trial,0),
                             '<br /> Topic news : ', round(success,0),
                             '<br /> % observed  : ', round(estimate*100,2),'%',
                             '<br /> % estimated : ', round(null.value*100,2),'%',
                             '<br /> Salience : ', round(salience,2),  
                             '<br /> p.value : ', round(p.value,4)),
               type = "bar")  %>%
    layout(title = title,
           yaxis = list(title = "% news"),
           barmode = 'stack')
  
  output<-list("table" = tab, "plotly" =p)
  
  return(output)
  
}
```


### Example


```{r who.what example,warning = FALSE, message = FALSE}




res_who_what<- who.what(hc=hc, 
                        test = TRUE,
                        minsamp = 5,
                        mintest = 1,
                        title = "Topic news by media - Significance")
res_who_what$plotly

```



## Topic variation through time (when.what)



### Function

```{r}
#### ---------------- when.what ----------------
#' @title  visualize variation of the topic through time
#' @name when.what
#' @description create a table of variation of the topic by media
#' @param test : visualize test (TRUE) or salience (FALSE)
#' @param minsamp : Threshold of sample size requested for salience computation
#' @param mintest sample size of estimate for chi-square test (default = 5)
#' @param title Title of the graphic


when.what <- function (hc = hypercube,
                       test = FALSE,
                       minsamp = 20,
                       mintest = 5,
                       title = "Who says What ?")
{
  
  tab<-hc
  {tab$what <-tab$what !="_no_"}
  
  tab<-tab[,list(trial = sum(news),success=round(sum(news*what),0)),by = list(when)]
  ref <-round(sum(tab$success)/sum(tab$trial),4)
  tab$null.value<-ref
  
  tab<-testchi2(tabtest=tab,
                minsamp = minsamp,
                mintest = mintest)
  
  if (test==FALSE) {tab$index =tab$salience
  tab<-tab[tab$trial > minsamp,]
  mycol<-brewer.pal(7,"YlOrRd")
  } 
  else {tab$index=tab$p.value
  tab<-tab[tab$trial*tab$null.value>mintest,]
  mycol<-brewer.pal(7,"RdYlBu")
  mycol[4]<-"lightyellow"
  }
  
  
  p <- plot_ly(tab,
               x = ~as.character(when),
               y = ~estimate*100,
               color= ~index,
               colors= mycol,
               hoverinfo = "text",
               text = ~paste('Time: ',when,
                             '<br /> Total news  : ', round(trial,0),
                             '<br /> Topic news : ', round(success,0),
                             '<br /> % observed  : ', round(estimate*100,2),'%',
                             '<br /> % estimated : ', round(null.value*100,2),'%',
                             '<br /> Salience : ', round(salience,2),  
                             '<br /> p.value : ', round(p.value,4)),
               type = "bar")  %>%
    layout(title = title,
           yaxis = list(title = "% news"),
           barmode = 'stack')
  
  output<-list("table" = tab, "plotly" =p)
  
  return(output)
  
}
```



### Example 1 : 2019-2020  by month


```{r when.what example1,warning = FALSE, message = FALSE}
# Modify time period by month
hc2 <- hc %>% mutate(when = cut(when,breaks="month"))


res_when_what<- when.what(hc=hc2, 
                          test=TRUE,
                          minsamp=10,
                          mintest=5,
                          title = "Topic news by month - Significance")


res_when_what$plotly
```



### Example 2 : 2020  by week

```{r when.what example2,warning = FALSE, message = FALSE}
# Modify time period by month
hc2 <- hc %>% filter(substr(when,1,4)=="2020") %>% mutate(when = cut(when,breaks="week"))


res_when_what<- when.what(hc=hc2, 
                          test=TRUE,
                          minsamp=10,
                          mintest=5,
                          title = "Topic news by week - Significance")


res_when_what$plotly
```

### Example 3 : Jan-March 2020 by day

```{r when.what example3,warning = FALSE, message = FALSE}
# Modify time period by month
hc2 <- hc %>% filter(when > as.Date("2020-01-01"), when < as.Date("2020-04-01")) 


res_when_what<- when.what(hc=hc2, 
                          test=TRUE,
                          minsamp=10,
                          mintest=5,
                          title = "Topic news by day - Significance")


res_when_what$plotly
```




## Topic variation through space (where.what)


### Function

```{r}

#### ---------------- where.what ----------------
#' @title  visualize spatialization of the topic 
#' @name where.what
#' @description create a table of variation of the topic by media
#' @param hc an hypercube prepared as data.table
#' @param test : visualize test (TRUE) or salience (FALSE)
#' @param minsamp : Threshold of sample size requested for salience computation
#' @param mintest sample size of estimate for chi-square test (default = 5)
#' @param map a map with coordinates in lat-long
#' @param proj a projection accepted by plotly
#' @param title Title of the graphic


where.what <- function (hc = hypercube,
                        test = FALSE,
                        minsamp = 20,
                        mintest = 5,
                        map = world_ctr,
                        proj = 'azimuthal equal area',
                        title = "Where said What ?")
{
  
  tab<-hc
  tab$what <-tab$what !="_no_"
  
  tab<-tab[,list(trial = round(sum(news),0),success=round(sum(news*what),0)),by = list(where1)]
  ref <-round(sum(tab$success)/sum(tab$trial),4)
  tab$null.value<-ref
  
  tab<-testchi2(tabtest=tab,
                minsamp = minsamp,
                mintest = mintest)
  
  
  
  tab<-tab[order(-chi2),]
  
  
  
  if (test==FALSE) {tab$index =tab$salience
  tab<-tab[tab$trial > minsamp,]
  mycol<-brewer.pal(7,"YlOrRd")
  } 
  else {tab$index=tab$p.value
  tab<-tab[tab$trial*tab$null.value>mintest,]
  mycol<-brewer.pal(7,"RdYlBu")
  mycol[4]<-"lightyellow"
  }
  
  
  map<-merge(map,tab,all.x=T,all.y=F,by.x="ISO3",by.y="where1")
  
  
  
  #map2<-map[is.na(map$pct)==F,]
  #map2<-st_centroid(map2)
  #map2<-st_drop_geometry(map2)
  
  
  g <- list(showframe = TRUE,
            framecolor= toRGB("gray20"),
            coastlinecolor = toRGB("gray20"),
            showland = TRUE,
            landcolor = toRGB("gray50"),
            showcountries = TRUE,
            countrycolor = toRGB("white"),
            countrywidth = 0.2,
            projection = list(type = proj))
  
  
  
  p<- plot_geo(map)%>%
    add_markers(x = ~lon,
                y = ~lat,
                sizes = c(0, 250),
                size = ~success,
                #             color= ~signif,
                color = ~index,
                colors= mycol,
                hoverinfo = "text",
                text = ~paste('Location: ',NAME,
                              '<br /> Total news  : ', round(trial,0),
                              '<br /> Topic news : ', round(success,0),
                              '<br /> % observed  : ', round(estimate*100,2),'%',
                              '<br /> % estimated : ', round(null.value*100,2),'%',
                              '<br /> Salience : ', round(salience,2),  
                              '<br /> p.value : ', round(p.value,4))) %>%
    
    layout(geo = g,
           title = title)
  
  
  
  output<-list("table" = tab, "plotly" =p)
  
  return(output)
  
}
```



### Example


```{r where.what example,warning = FALSE, message = FALSE}
map<-readRDS("data/dico_states/world_ctr_4326.Rdata")
hc2<-hc %>% filter(where1 !="_no_", where2 !="_no_") %>% filter(where1 !="GBR", where2 !="GBR")

res_where_what<- where.what(hc=hc2,
                            test=TRUE,
                            minsamp=10,
                            map = map, 
                            mintest =2,
                            title = "Topic news by states - Significance")
res_where_what$plotly
```



# Spatial Networks




## Geo networks modelisation


### Hypercube Filter (function)


```{r}
hc_filter <- function(don = hc,
                      who = "who",
                      when = "when",
                      where1 = "where1",
                      where2 = "where2",
                      wgt = "tags",
                      self = FALSE,
                      when_start = NA,
                      when_end = NA,
                      who_exc = NA,
                      who_inc = NA,
                      where1_exc = NA,
                      where1_inc = NA,
                      where2_exc = NA,
                      where2_inc = NA)

  {                          
  
    df<-data.table(who = don[[who]],
                   when = don[[when]],
                   where1 = don[[where1]],
                   where2 = don[[where2]],
                   wgt = don[[wgt]])
    
    # Select time period
        if (is.na(when_start)==FALSE) { 
        df <- df[when >= as.Date(when_start), ]}
        if (is.na(when_end)==FALSE) { 
        df <- df[when <= as.Date(when_end), ]}
    # Select who
        if (is.na(who_exc)==FALSE) { 
        df <- df[!(who %in% who_exc), ]}
        if (is.na(who_inc)==FALSE) { 
        df <- df[(who %in% who_inc), ]}
    # Select where1
        if (is.na(where1_exc)==FALSE) { 
        df <- df[!(where1 %in% where1_exc), ]}
        if (is.na(where1_inc)==FALSE) { 
        df <- df[(where1 %in% where1_inc), ]}
    # Select where2
        if (is.na(where2_exc)==FALSE) { 
        df <- df[!(where2 %in% where2_exc), ]}
        if (is.na(where2_inc)==FALSE) { 
        df <- df[(where2 %in% where2_inc), ]}
    # eliminate internal links
       if (self==FALSE) { 
        df <- df[(where1 != where2), ]}
    return(df)
  
}
```

### Matrix builder (function)




```{r}
build_int <- function(don = don,       # a dataframe with columns i, j , Fij
                      i = "where1",
                      j = "where2",
                      Fij = "wgt",
                      s1 = 10,
                      s2 = 10,
                      n1 = 2,
                      n2 = 2,
                      k = 1)

{  
  df<-data.table(i=don[[i]],j=don[[j]],Fij=don[[Fij]])
  int <-df[,.(Fij=sum(Fij)),.(i,j)]
  int<-dcast(int,formula = i~j,fill = 0)
  mat<-as.matrix(int[,-1])
  row.names(mat)<-int$i
  mat<-mat[apply(mat,1,sum)>=s1,apply(mat,2,sum)>=s2 ]
  m0<-mat
  m0[m0<k]<-0
  m0[m0>=k]<-1
  mat<-mat[apply(m0,1,sum)>=n1,apply(m0,2,sum)>=n2 ]
  int<-reshape2::melt(mat)
  names(int) <-c("i","j","Fij")
  return(int)
}

```


### Random model (function)


```{r}

rand_int <- function(int = int, # A table with columns i, j Fij
                     maxsize = 100000,
                     diag    = FALSE,
                     resid   = FALSE) {
    # Eliminate diagonal ?
    if (diag==FALSE) { 
        int <- int[as.character(int$i) != as.character(int$j), ]}
  
    # Compute model if size not too large
    if (dim(int)[1] < maxsize) {
       # Proceed to poisson regression model
       mod <- glm( formula = Fij ~ i + j,family = "poisson", data = int)
  
       # Add residuals if requested
       if(resid == TRUE)   { 
          # Add estimates
          int$Eij <- mod$fitted.values

          # Add absolute residuals
          int$Rabs_ij <- int$Fij-int$Eij

          # Add relative residuals
          int$Rrel_ij <- int$Fij/int$Eij

          # Add chi-square residuals
          int$Rchi_ij <-  (int$Rabs_ij)**2 / int$Eij
          int$Rchi_ij[int$Rabs_ij<0]<- -int$Rchi_ij[int$Rabs_ij<0]
          }
         
    } else { paste ("Table > 100000 -  \n 
                     modify maxsize =  parameter \n
                     if you are sure that your computer can do it !")}
  # Export results
  int$i<-as.character(int$i)
  int$j<-as.character(int$j)
  return(int)
  
 }

```


### Visualize network (function)

```{r}
geo_network<- function(don = don,
                       from = "i",
                        to = "j", 
                        size = "Fij",
                        minsize = 1,
                        maxsize = NA,
                        test = "Fij",
                        mintest = 1,
                        loops  = FALSE, 
                        title = "Network")

{
int<-data.frame(i = as.character(don[,from]),
                j = as.character(don[,to]),
                size = don[,size],
                test = don[,test]
                )
if (is.na(minsize)==FALSE) {int =int[int$size >= minsize,]} 
if (is.na(maxsize)==FALSE) {int =int[int$size <= maxsize,]} 
if (is.na(mintest)==FALSE) {int =int[int$test >= mintest,]}

nodes<-data.frame(code = unique(c(int$i,int$j)))
nodes$code<-as.character(nodes$code)
nodes$id<-1:length(nodes$code)
nodes$label<-nodes$code
nodes$color <-"gray"
nodes$color[nodes$code %in% int$j]<-"red"


# Adjust edge codes
edges <- int %>% mutate(width = 5+5*size / max(size)) %>%
                left_join(nodes %>% select(i=code, from = id)) %>%  
                left_join(nodes %>% select(j=code, to = id )) 

# compute nodesize
toti<-int %>% group_by(i) %>% summarize(size =sum(size)) %>% select (code=i,size)
totj<-int %>% group_by(j) %>% summarize(size =sum(size)) %>% select (code=j,size)
tot<-rbind(toti,totj)
tot<-unique(tot)
tot$code<-as.factor(tot$code)
nodes <- left_join(nodes,tot) %>% mutate(value = 1 +5*sqrt(size/max(size)))


#sel_nodes <-nodes %>% filter(code %in% unique(c(sel_edges$i,sel_edges$j)))

# eliminate loops

if(loops == FALSE) {edges <- edges[edges$from < edges$to,]}

net<- visNetwork(nodes, 
                  edges, 
                  main = title,
height = "1000px", 
                  width = "70%")   %>%   
   visNodes(scaling =list(min =20, max=60, 
                          label=list(min=20,max=80, 
                                    maxVisible = 20)))%>%
  visEdges(scaling = list(min=20,max=60))%>%
       visOptions(highlightNearest = TRUE,
     #               selectedBy = "group", 
    #               manipulation = TRUE,
                  nodesIdSelection = TRUE) %>%
        visInteraction(navigationButtons = TRUE) %>%
         visLegend() %>%
      visIgraphLayout(layout ="layout.fruchterman.reingold",smooth = TRUE)

net
 return(net)
 } 

```


## Application



```{r}
# Load complete hypercube
hc <- readRDS("data/corpus/hc_mycorpus_states_pand.RDS")
hc <- hc %>% filter(where1 !="_no_", where2 != "_no_")

# Eliminate non foreign news 
hc<-hc[hc$where1 != substr(who,4,6),]
hc<-hc[hc$where2 != substr(who,4,6),]

# Add complete labels
map<-readRDS("data/dico_states/world_map_4326.Rdata")
labs<-st_drop_geometry(map)
labs<-labs[,c(1,4)]

# Shorten the name of USA
labs$NAME[labs$ISO3=="USA"]<-"U.S.A."


names(labs)<-c("where1","geofr1")
hc<-left_join(hc,labs)
names(labs)<-c("where2","geofr2")
hc<-left_join(hc,labs)

hc_geo_geo <- hc
```


### Reference network



```{r}
hc<-hc_geo_geo 



hc<-hc_filter(don = hc,
                             wgt = "news",
                             where1 = "geofr1",
                             where2 = "geofr2",
                             where1_exc = c("_no_"),
                             where2_exc = c("_no_"),
                             self = FALSE
                           )

int <- build_int(don = hc,
                 s1=2,
                 s2=2,
                 n1=1,
                 n2=1,
                 k=0)

mod<-rand_int(int,
              resid = TRUE,
              diag = FALSE)


network<- geo_network(mod,
                      size = "Fij",
                      minsize = 1,
                      test = "Rchi_ij",
                      mintest = 3.84)
network

```


